In [1]:
import warnings

# Suppress specific FutureWarning
warnings.simplefilter("ignore", category=FutureWarning)
In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset (Make sure to replace the path with your local dataset path)
df = pd.read_csv("C:/Users/sarva/Desktop/kc_house_data.csv")
df
Out[2]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21608 263000018 20140521T000000 360000.0 3 2.50 1530 1131 3.0 0 0 ... 8 1530 0 2009 0 98103 47.6993 -122.346 1530 1509
21609 6600060120 20150223T000000 400000.0 4 2.50 2310 5813 2.0 0 0 ... 8 2310 0 2014 0 98146 47.5107 -122.362 1830 7200
21610 1523300141 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 ... 7 1020 0 2009 0 98144 47.5944 -122.299 1020 2007
21611 291310100 20150116T000000 400000.0 3 2.50 1600 2388 2.0 0 0 ... 8 1600 0 2004 0 98027 47.5345 -122.069 1410 1287
21612 1523300157 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 ... 7 1020 0 2008 0 98144 47.5941 -122.299 1020 1357

21613 rows × 21 columns

In [3]:
df.head()
Out[3]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

In [4]:
df.tail()
Out[4]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
21608 263000018 20140521T000000 360000.0 3 2.50 1530 1131 3.0 0 0 ... 8 1530 0 2009 0 98103 47.6993 -122.346 1530 1509
21609 6600060120 20150223T000000 400000.0 4 2.50 2310 5813 2.0 0 0 ... 8 2310 0 2014 0 98146 47.5107 -122.362 1830 7200
21610 1523300141 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 ... 7 1020 0 2009 0 98144 47.5944 -122.299 1020 2007
21611 291310100 20150116T000000 400000.0 3 2.50 1600 2388 2.0 0 0 ... 8 1600 0 2004 0 98027 47.5345 -122.069 1410 1287
21612 1523300157 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 ... 7 1020 0 2008 0 98144 47.5941 -122.299 1020 1357

5 rows × 21 columns

In [5]:
df.columns
Out[5]:
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
In [6]:
# Check for null values
df.isnull().sum()
Out[6]:
id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64
In [7]:
# Check for duplicate rows
df.duplicated().sum()
Out[7]:
0
In [8]:
# Check for any missing values in the entire DataFrame
if df.isnull().values.any():
    print("Missing values found in the dataset.")
else:
    print("No missing values found.")
No missing values found.
In [9]:
# Handle missing values (if any)
df.dropna(inplace=True)
In [10]:
# Find duplicate rows in the entire DataFrame
duplicate_rows = df[df.duplicated()].sum()

# Print the duplicate rows
print(duplicate_rows)
id                 0
date               0
price            0.0
bedrooms           0
bathrooms        0.0
sqft_living        0
sqft_lot           0
floors           0.0
waterfront         0
view               0
condition          0
grade              0
sqft_above         0
sqft_basement      0
yr_built           0
yr_renovated       0
zipcode            0
lat              0.0
long             0.0
sqft_living15      0
sqft_lot15         0
dtype: object
In [11]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(df, column):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    
    # Calculate IQR (Interquartile Range)
    IQR = Q3 - Q1
    
    # Define the lower and upper bounds for outliers
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    
    return lower_limit, upper_limit
In [12]:
# Function to apply Winsorization (outlier capping)
def apply_winsorization(df, column, lower_limit, upper_limit):
    # Cap the values outside the lower and upper limits
    df[column] = np.where(df[column] < lower_limit, lower_limit, df[column])
    df[column] = np.where(df[column] > upper_limit, upper_limit, df[column])
    return df
In [13]:
# List of columns to check for outliers
columns_to_check = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built']

# For each column in columns_to_check, detect outliers and print summary before and after Winsorization
for col in columns_to_check:
    # Calculate the lower and upper limits for outliers
    lower_limit, upper_limit = detect_outliers_iqr(df, col)
    
    # Print summary of the original column
    print(f"Column: {col}")
    print("Before Winsorization (sample):")
    print(df[col].describe())
    
    # Apply Winsorization (Capping outliers)
    df_winsorized = apply_winsorization(df.copy(), col, lower_limit, upper_limit)
    
    # Print summary of the capped column
    print("After Winsorization (sample):")
    print(df_winsorized[col].describe())
    print("-" * 40)
Column: price
Before Winsorization (sample):
count    2.161300e+04
mean     5.400881e+05
std      3.671272e+05
min      7.500000e+04
25%      3.219500e+05
50%      4.500000e+05
75%      6.450000e+05
max      7.700000e+06
Name: price, dtype: float64
After Winsorization (sample):
count    2.161300e+04
mean     5.115873e+05
std      2.500026e+05
min      7.500000e+04
25%      3.219500e+05
50%      4.500000e+05
75%      6.450000e+05
max      1.129575e+06
Name: price, dtype: float64
----------------------------------------
Column: sqft_living
Before Winsorization (sample):
count    21613.000000
mean      2079.899736
std        918.440897
min        290.000000
25%       1427.000000
50%       1910.000000
75%       2550.000000
max      13540.000000
Name: sqft_living, dtype: float64
After Winsorization (sample):
count    21613.000000
mean      2058.078564
std        839.307806
min        290.000000
25%       1427.000000
50%       1910.000000
75%       2550.000000
max       4234.500000
Name: sqft_living, dtype: float64
----------------------------------------
Column: sqft_lot
Before Winsorization (sample):
count    2.161300e+04
mean     1.510697e+04
std      4.142051e+04
min      5.200000e+02
25%      5.040000e+03
50%      7.618000e+03
75%      1.068800e+04
max      1.651359e+06
Name: sqft_lot, dtype: float64
After Winsorization (sample):
count    21613.000000
mean      8705.224448
std       5046.482073
min        520.000000
25%       5040.000000
50%       7618.000000
75%      10688.000000
max      19160.000000
Name: sqft_lot, dtype: float64
----------------------------------------
Column: sqft_above
Before Winsorization (sample):
count    21613.000000
mean      1788.390691
std        828.090978
min        290.000000
25%       1190.000000
50%       1560.000000
75%       2210.000000
max       9410.000000
Name: sqft_above, dtype: float64
After Winsorization (sample):
count    21613.000000
mean      1769.563041
std        764.029323
min        290.000000
25%       1190.000000
50%       1560.000000
75%       2210.000000
max       3740.000000
Name: sqft_above, dtype: float64
----------------------------------------
Column: sqft_basement
Before Winsorization (sample):
count    21613.000000
mean       291.509045
std        442.575043
min          0.000000
25%          0.000000
50%          0.000000
75%        560.000000
max       4820.000000
Name: sqft_basement, dtype: float64
After Winsorization (sample):
count    21613.000000
mean       284.050155
std        417.064559
min          0.000000
25%          0.000000
50%          0.000000
75%        560.000000
max       1400.000000
Name: sqft_basement, dtype: float64
----------------------------------------
Column: yr_built
Before Winsorization (sample):
count    21613.000000
mean      1971.005136
std         29.373411
min       1900.000000
25%       1951.000000
50%       1975.000000
75%       1997.000000
max       2015.000000
Name: yr_built, dtype: float64
After Winsorization (sample):
count    21613.000000
mean      1971.005136
std         29.373411
min       1900.000000
25%       1951.000000
50%       1975.000000
75%       1997.000000
max       2015.000000
Name: yr_built, dtype: float64
----------------------------------------
In [14]:
# Function to plot boxplots before and after outlier treatment (individually)
def plot_individual_boxplots(df, columns):
    for col in columns:
        # Calculate the lower and upper limits for outliers
        lower_limit, upper_limit = detect_outliers_iqr(df, col)
        
        # Plot Before Winsorization (Original Data)
        plt.figure(figsize=(6, 4))
        sns.boxplot(x=df[col])
        plt.title(f'{col} Before Winsorization')
        plt.show()

        # Apply Winsorization (Capping outliers)
        df_winsorized = apply_winsorization(df.copy(), col, lower_limit, upper_limit)

        # Plot After Winsorization (Winsorized Data)
        plt.figure(figsize=(6, 4))
        sns.boxplot(x=df_winsorized[col])
        plt.title(f'{col} After Winsorization')
        plt.show()

# Plot all columns before and after outlier treatment individually
plot_individual_boxplots(df, columns_to_check)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [15]:
#Identify numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns

# Optionally, if you need to convert dates to numeric, you can do this here
# For example, if there's a 'date' column, we can convert it to datetime and then to the number of days or years
# Example for a 'date' column: df['date'] = pd.to_datetime(df['date'], errors='coerce')
# df['date'] = (df['date'] - df['date'].min())  # Convert to the number of days since the earliest date

# Create correlation matrix for only numeric columns
correlation_matrix = df[numeric_cols].corr()

#  Plot correlation matrix heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
No description has been provided for this image
In [16]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv('kc_house_data.csv')

# Optional: Label Encoding categorical features if necessary
# For example, if 'waterfront' is a string, convert it to numerical
encoder = LabelEncoder()
df['waterfront'] = encoder.fit_transform(df['waterfront'])

Analytical Questions:¶

1. How do house prices vary across different neighborhoods (zipcodes)?¶

In [17]:
fig = px.box(df, x="zipcode", y="price", title="1. Price Variation Across Different Neighborhoods",
             color="zipcode",
             labels={"zipcode": "Neighborhood (Zipcode)", "price": "Price"})
fig.update_layout(showlegend=False)
fig.show()

Analysis:¶

 • The boxplot displays the median, IQR, and outliers for house prices by neighborhood.
 • Larger boxes indicate more price variation, while whiskers show the price range.
 • Outliers point to neighborhoods with extreme price values.

Insights:¶

 • High-price neighborhoods show large IQRs and outliers, while affordable areas have smaller spreads.

2. How does the number of bedrooms affect house prices?¶

In [18]:
fig = px.scatter(
    df, x="bedrooms", y="price", title="2. Price vs. Number of Bedrooms", trendline="ols", color="bedrooms"
)

# Customize layout and update title properties
fig.update_layout(
    title={
        "text": "2. Price vs. Number of Bedrooms",  # Set the title text
        "x": 0.5,  # Center the title horizontally
        "yanchor": "top",  # Align the title to the top
        "font": dict(size=24, family="Arial", color="darkblue"),  # Set font size, family, and color
    },
    xaxis_title="Number of Bedrooms",
    yaxis_title="Price",
)

# Update marker properties
fig.update_traces(marker=dict(size=12, opacity=0.6))

fig.show()

Analysis:¶

• The scatter plot with a trendline reveals the relationship between the number of bedrooms and price.
• The trendline (OLS) indicates a positive correlation, meaning that as the number of bedrooms increases, so does the price.
• Coloring by number of bedrooms helps highlight the variation in prices across different bedroom counts.

Insights:¶

• More bedrooms generally lead to higher house prices.
• Outliers and clusters can reveal specific patterns or neighborhoods where the price increase isn’t as steep.

3. What is the distribution of house prices in the dataset?¶

In [19]:
fig = px.histogram(df, x="price", nbins=50,
                   marginal="box",  # Adding marginal box plot
                   color_discrete_sequence=["#636EFA"])  # Use one color for bins

fig.update_layout( title = ("3. Price Distribution vs Frequency"),
    xaxis_title="Price ($)",
    yaxis_title="Frequency",
    title_x=0.5,  # Center the title
    title_font=dict(size=24, family="Arial", color="darkblue"),  # Match font color and size
    xaxis=dict(showgrid=True, gridcolor='lightgray', zeroline=False),
    yaxis=dict(showgrid=True, gridcolor='lightgray', zeroline=False),
    plot_bgcolor='white',  # White background for a clean presentation
    paper_bgcolor='white',
    font=dict(family="Arial", size=14, color="DarkSlateGrey"),
)

# Show the updated plot
fig.show()

Analysis:¶

• The histogram provides a clear view of house price distribution, with marginal box plots for additional insights into the spread and central tendency.
• The price distribution shows a skewed distribution, indicating most houses fall within a certain price range with a few high-priced outliers.
• The use of the marginal box plot provides information on the median price, interquartile range, and presence of outliers.

Insights:¶

• Most house prices are concentrated around a mid-range value, but the presence of outliers (extremely high-priced homes) skews the distribution.
• The box plot gives insights into how widely prices are dispersed and where the majority of prices lie.

4. What are the relationships between key features in the dataset and how do they correlate with house prices?¶

In [20]:
# Set style for the plot
sns.set(style="whitegrid")

# Define the features (columns) you want to use in the pairplot
features = ['sqft_living', 'bedrooms', 'bathrooms', 'price', 'sqft_lot']  

# Creating the pairplot
sns.pairplot(df[features], 
             hue='price', 
             palette='viridis',  # Use 'viridis' for a smooth, color-blind friendly palette
             plot_kws={'alpha': 0.7, 's': 70, 'edgecolor': 'w'},  # More transparency and styling
             diag_kind='kde',  # Use KDE for the diagonal histograms to show distribution
             height=3)  # Control the size of the plot

# Title and labels
plt.suptitle("4. Exploring Relationships Between Key Features", color="darkblue", fontsize=24)
plt.subplots_adjust(top=0.95)  # Adjust title positioning to avoid overlap with plots

# Show the plot
plt.show()
No description has been provided for this image

Analysis:¶

•  The pairplot visualizes pairwise relationships between multiple variables, with a smooth KDE (Kernel Density Estimate) on the diagonals to show the distribution of each feature.
• The hue based on price allows us to color-code the data points based on house price, revealing how each feature interacts with price.

Insights:¶

• Some features like square footage and number of bedrooms show a stronger relationship with price, while others may be less correlated.
• The KDE diagonal plots provide insights into the distribution of key variables and any skewness or trends within them.

5. How does house condition affect the price distribution across different house conditions?¶

In [21]:
plt.figure(figsize=(12, 6))


df['condition'] = df['condition'].astype('category')

# Creating a violin plot for house condition vs price with 'hue' set to 'condition'
sns.violinplot(data=df, x='condition', y='price', hue='condition', palette='Set2', legend=False)

# Title and labels
plt.title('5. Price Distribution Across Different House Conditions',color = "darkblue", fontsize=18)
plt.xlabel('House Condition', fontsize=14)
plt.ylabel('Price ($)', fontsize=14)

# Improve x-axis labels (no rotation needed)
plt.xticks(fontsize=12)

# Show the plot
plt.show()
No description has been provided for this image

Analysis:¶

• The violin plot provides a detailed view of the price distribution across various house conditions.
• The hue based on condition distinguishes between the conditions (e.g., good, fair, poor), allowing for easy comparison of how each condition affects price.
• The Set2 palette is used for clear visual distinction between different conditions, while violin plot shapes show the distribution and density of prices.

Insights:¶

• Houses with better conditions tend to have a higher price distribution, while those in poorer condition may have a wider range of prices but lower median values.
• The plot's distribution and density provide deeper insights into variability within each house condition.

6. How does the size (square footage) of a house affect its price?¶

In [22]:
import plotly.express as px

# Scatter plot with a trendline for square footage vs price
fig = px.scatter(df, x='sqft_living', y='price', 
                 trendline="ols",  # Adding a trendline to show correlation
                 title='6. Price vs Square Footage of House',
                 labels={'sqft_living': 'Square Footage (sqft)', 'price': 'Price ($)'},
                 color='sqft_living',  # Color points by sqft_living
                 color_continuous_scale='Viridis',  # Color scale for better visual appeal
                 template='plotly')  # Use default light template for white background

# Update layout for cleaner presentation
fig.update_layout(
    xaxis_title="Square Footage (sqft)",
    yaxis_title="Price ($)",
    plot_bgcolor='white',  # Set white background for the plot area
    title_x=0.5,  # Center the title
    title_font=dict(size=20, color='darkblue'),
    font=dict(size=14, color='black'),
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)

fig.show()

Analysis:¶

• This scatter plot helps to visualize how larger homes tend to have higher prices.
• The trendline provides a clear indication of the positive correlation between square footage and price.

Insights:¶

• Positive Correlation: There’s a clear positive relationship between square footage and price, meaning larger homes are typically more expensive.
• Outliers: There might be some outliers where small homes are priced higher or large homes are priced lower, possibly due to factors like location, condition, or renovations.
• Data Distribution: The color gradient shows a range of square footage values, making it easier to see how larger homes are distributed across the price range.

7. How does the year of construction affect house prices?¶

In [23]:
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x="yr_built", y="price", hue="yr_built", palette="viridis", edgecolor='w', s=100)

plt.title("7. Price Distribution by Year of Construction", color = 'darkblue', fontsize=16)
plt.xlabel("Year Built", fontsize=12)
plt.ylabel("Price ($)", fontsize=12)
plt.legend(title='Year Built', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
No description has been provided for this image

Analysis:¶

• Scatter plot shows the relationship between year built and price.
• Color gradient helps differentiate houses by their construction year.
• Modern homes tend to have higher prices due to newer amenities, while older homes are generally priced lower unless renovated.

Insights:¶

• Newer homes generally fetch higher prices.
• Older homes may be priced lower, with some exceptions (luxury or renovated properties).

8. How does the average house price vary across different zip codes and with respect to the number of bedrooms?¶

In [24]:
import plotly.express as px

# Calculate average price per zipcode and per number of bedrooms
avg_price_data = df.groupby(['zipcode', 'bedrooms'], as_index=False)['price'].mean()

# Create a bar plot with price comparison across zipcodes and number of bedrooms
fig = px.bar(avg_price_data, 
             x="zipcode", 
             y="price", 
             color="bedrooms", 
             title="8. Average Price by Zipcode and Number of Bedrooms", 
             labels={"zipcode": "Neighborhood (Zipcode)", "price": "Average Price ($)", "bedrooms": "Number of Bedrooms"},
             color_continuous_scale='Viridis',  # Color scale for bedrooms
             barmode='group'  # Group bars by number of bedrooms
            )

fig.update_layout(
    xaxis_title="Zipcode",
    yaxis_title="Average Price ($)",
    title_x=0.5,  # Center the title
    title_font=dict(size=20, color='darkblue'),
    xaxis=dict(showgrid=True, gridcolor='lightgray', tickangle=0),  
    yaxis=dict(showgrid=True, gridcolor='lightgray'),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family="Arial", size=14, color="black"),
    showlegend=True
)

fig.show()

Analysis:¶

• The bar plot shows how the average price of homes varies by zip code and the number of bedrooms.
• Different colors in the bars represent various bedroom counts, highlighting their impact on home prices across zip codes.
• It helps identify areas where homes with more bedrooms have significantly higher prices.

Insights:¶

• Higher Bedroom Count = Higher Price: Homes with more bedrooms generally have higher average prices.
• Geographical Influence: Certain zip codes, likely more affluent or desirable areas, consistently show higher home prices.
• Price Variation: Some zip codes show significant price variation, suggesting factors like location and amenities play a major role.
• Market Trends: The relationship between price and number of bedrooms indicates demand for larger homes in specific areas.

9. How does proximity to a waterfront affect house prices?¶

In [25]:
plt.figure(figsize=(10, 6))

# Creatimg a strip plot with hue based on 'waterfront'
sns.stripplot(x="waterfront", y="price", hue="waterfront", data=df, jitter=True, palette="Spectral", size=8, alpha=0.6)

# Title and labels
plt.title('9. Price Comparison Based on Proximity to Waterfront', color = 'darkblue',fontsize=18)
plt.xlabel('Waterfront (0: No, 1: Yes)', fontsize=14)
plt.ylabel('Price ($)', fontsize=14)
plt.xticks([0, 1], ['No', 'Yes'])
plt.legend(title='Waterfront', loc='upper left', bbox_to_anchor=(1.05, 1))  # Add legend

plt.show()
No description has been provided for this image
In [ ]:
 

Analysis:¶

• Waterfront houses (labeled as 'Yes') typically have higher prices compared to non-waterfront houses (labeled 'No').
• The plot shows variability in prices, with both waterfront and non-waterfront houses exhibiting a range of values.

Insights:¶

• Waterfront properties generally command higher prices.
• Price range for waterfront homes is wider, showing both affordable and premium waterfront houses.
• Non-waterfront properties tend to have a lower average price, but can still vary based on other features.

10. How does the number of floors in a house impact its price?¶

In [26]:
# Pivot data for heatmap visualization (using string 'mean' for aggregation)
floor_price_data = df.pivot_table(index="floors", values="price", aggfunc="mean")

plt.figure(figsize=(10, 6))
sns.heatmap(floor_price_data, annot=True, cmap="YlGnBu", cbar_kws={'label': 'Price ($)'}, linewidths=0.5)

plt.title("10. Price Variation by Number of Floors", color = 'darkblue', fontsize=16)
plt.xlabel("Number of Floors", fontsize=12)
plt.ylabel("Price ($)", fontsize=12)
plt.show()
No description has been provided for this image

Analysis:¶

• A heatmap is used to visualize the average price variation across different number of floors in the dataset.
• The pivot table shows the mean price for each number of floors, helping to identify the relationship between floor count and price.
• The color scale indicates price range, with darker shades corresponding to higher prices.

Insights:¶

• Houses with more floors tend to have higher average prices, though the variation between floors is minimal in some cases.
• Single-floor homes tend to have lower average prices compared to homes with multiple floors.
• The heatmap provides clear visual cues for how floor count is correlated with price.

Machine Learning Part:¶

In [27]:
# Importing necessary libraries for ML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Preprocessing and feature selection
# For simplicity, let's use some important columns from the dataset
features = ['sqft_living', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'yr_built']

# Selecting X (features) and y (target variable)
X = df[features]
y = df['price']

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initializing the Linear Regression model
model = LinearRegression()

# Training the model
model.fit(X_train_scaled, y_train)

# Predictions on test data
y_pred = model.predict(X_test_scaled)

# Model evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output the evaluation metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared: {r2:.2f}")

# Visualizing the predictions vs actual prices in a colorful way
plt.figure(figsize=(10, 6))

# Scatter plot with color representing the predicted values
plt.scatter(y_test, y_pred, c=y_pred, cmap='viridis', alpha=0.7, edgecolor='w', s=80)

# Plotting the ideal prediction line (where predicted = actual)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', label='Ideal Prediction Line')

# Title and labels
plt.title("Predicted vs Actual Prices ", color = 'darkblue' , fontsize= 24)
plt.xlabel("Actual Prices ($)", fontsize=12)
plt.ylabel("Predicted Prices ($)", fontsize=12)

# Adding a color bar to show the range of predicted values
plt.colorbar(label='Predicted Price')

# Adding grid for better readability
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

# Show the plot
plt.legend(loc='upper left')
plt.show()
Mean Squared Error (MSE): 52585547066.12
R-squared: 0.65
No description has been provided for this image
In [ ]: